library(MASS)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::select() masks MASS::select()
mydata<- MASS::Boston
#attach(Boston)
#mydata
**Number of observations and columns
nrow(mydata)
## [1] 506
ncol(mydata)
## [1] 14
** Structure of the data
str(mydata)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
** Top and bottom of the data
head(mydata)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
tail(mydata)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 501 0.22438 0 9.69 0 0.585 6.027 79.7 2.4982 6 391 19.2 396.90 14.33
## 502 0.06263 0 11.93 0 0.573 6.593 69.1 2.4786 1 273 21.0 391.99 9.67
## 503 0.04527 0 11.93 0 0.573 6.120 76.7 2.2875 1 273 21.0 396.90 9.08
## 504 0.06076 0 11.93 0 0.573 6.976 91.0 2.1675 1 273 21.0 396.90 5.64
## 505 0.10959 0 11.93 0 0.573 6.794 89.3 2.3889 1 273 21.0 393.45 6.48
## 506 0.04741 0 11.93 0 0.573 6.030 80.8 2.5050 1 273 21.0 396.90 7.88
## medv
## 501 16.8
## 502 22.4
## 503 20.6
## 504 23.9
## 505 22.0
## 506 11.9
** Basic visualization
plot(crim ~ rad, data = mydata, col = "dodgerblue", pch = 20, cex = 1.5,
main = "Boston: Crim vs Rad")
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
library(corrplot)
## corrplot 0.84 loaded
corr <- cor(Boston)
corrplot(corr,type = "full")
** boxplot on a few variables
par(mfrow= c(2,2))
boxplot(mydata$rad, main="Index of Access to Radial Highways",col = "red")
boxplot(mydata$zn, main="Landzone",col = "yellow")
boxplot(mydata$medv, main="Median val of occupies homes",col="green")
boxplot(mydata$dis,main="Mean of distances between employment centres",col = "blue")
scatter plot
ggplot(Boston,aes(x=dis, y=crim))+ geom_point()+labs(title = 'Dis')
summary(mydata$crim)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00632 0.08204 0.25651 3.61352 3.67708 88.97620
require(ggplot2)
require(plotly)
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(data = Boston, x = ~tax, y = ~crim)
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
featurePlot(x= mydata[,c("dis","tax","zn","rad","black","lstat","ptratio","medv")],y= mydata$crim)
pairs(Boston)
library(ggcorrplot)
corr=round(cor(Boston),1)
ggcorrplot(corr,hc.order= TRUE,lab= TRUE)
#Model1
mod1 <- lm(crim~.,Boston)
par(mfrow=c(2,2),mar=c(4,4,2,0.5))
plot(mod1)
summary(mod1)
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.924 -2.120 -0.353 1.019 75.051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.033228 7.234903 2.354 0.018949 *
## zn 0.044855 0.018734 2.394 0.017025 *
## indus -0.063855 0.083407 -0.766 0.444294
## chas -0.749134 1.180147 -0.635 0.525867
## nox -10.313535 5.275536 -1.955 0.051152 .
## rm 0.430131 0.612830 0.702 0.483089
## age 0.001452 0.017925 0.081 0.935488
## dis -0.987176 0.281817 -3.503 0.000502 ***
## rad 0.588209 0.088049 6.680 6.46e-11 ***
## tax -0.003780 0.005156 -0.733 0.463793
## ptratio -0.271081 0.186450 -1.454 0.146611
## black -0.007538 0.003673 -2.052 0.040702 *
## lstat 0.126211 0.075725 1.667 0.096208 .
## medv -0.198887 0.060516 -3.287 0.001087 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.439 on 492 degrees of freedom
## Multiple R-squared: 0.454, Adjusted R-squared: 0.4396
## F-statistic: 31.47 on 13 and 492 DF, p-value: < 2.2e-16
shapiro.test(mod1$residuals)
##
## Shapiro-Wilk normality test
##
## data: mod1$residuals
## W = 0.50435, p-value < 2.2e-16
#Model2
mod2 <- lm(crim~dis +rad +medv, mydata)
par(mfrow=c(2,2),mar=c(4,4,2,0.5))
plot(mod2)
summary(mod2)
##
## Call:
## lm(formula = crim ~ dis + rad + medv, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.010 -1.785 -0.502 0.869 75.223
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.49320 1.23555 2.827 0.00488 **
## dis -0.32412 0.15992 -2.027 0.04321 *
## rad 0.51528 0.04051 12.719 < 2e-16 ***
## medv -0.15844 0.03443 -4.602 5.3e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.558 on 502 degrees of freedom
## Multiple R-squared: 0.4222, Adjusted R-squared: 0.4187
## F-statistic: 122.3 on 3 and 502 DF, p-value: < 2.2e-16
m1<-boxcox(crim~dis+rad+medv,data=mydata)
b1<- m1$x[which.max(m1$y)]
mydata$y<- ((mydata$crim)^b1-1/b1)
bcm1<-lm(y~dis + rad + medv,data= mydata)
summary(bcm1)
##
## Call:
## lm(formula = y ~ dis + rad + medv, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.133706 -0.038786 -0.004835 0.032465 0.142020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.552e+01 9.622e-03 -1612.846 < 2e-16 ***
## dis -1.905e-02 1.245e-03 -15.298 < 2e-16 ***
## rad 9.880e-03 3.155e-04 31.314 < 2e-16 ***
## medv -1.795e-03 2.681e-04 -6.696 5.74e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05107 on 502 degrees of freedom
## Multiple R-squared: 0.8448, Adjusted R-squared: 0.8439
## F-statistic: 910.7 on 3 and 502 DF, p-value: < 2.2e-16
plot(bcm1)
shapiro.test(bcm1$residuals)
##
## Shapiro-Wilk normality test
##
## data: bcm1$residuals
## W = 0.98073, p-value = 3.077e-06
** Shapiro test
for (i in 1:ncol(mydata)){
shapiro.test(mydata[,i])$p.value %>%print()}
## [1] 1.328589e-36
## [1] 7.88283e-34
## [1] 1.064699e-17
## [1] 2.350507e-40
## [1] 5.776224e-14
## [1] 2.411977e-10
## [1] 2.230765e-18
## [1] 2.18539e-17
## [1] 8.073213e-30
## [1] 1.162781e-23
## [1] 2.360649e-17
## [1] 6.05766e-36
## [1] 8.286632e-14
## [1] 4.941386e-16
## [1] 7.734451e-15
step.model <- stepAIC(mod1,direction = "backward",trace = TRUE)
## Start: AIC=1898.56
## crim ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + black + lstat + medv
##
## Df Sum of Sq RSS AIC
## - age 1 0.27 20400 1896.6
## - chas 1 16.71 20417 1897.0
## - rm 1 20.43 20420 1897.1
## - tax 1 22.29 20422 1897.1
## - indus 1 24.30 20424 1897.2
## <none> 20400 1898.6
## - ptratio 1 87.65 20488 1898.7
## - lstat 1 115.18 20515 1899.4
## - nox 1 158.47 20558 1900.5
## - black 1 174.58 20574 1900.9
## - zn 1 237.70 20638 1902.4
## - medv 1 447.85 20848 1907.5
## - dis 1 508.77 20909 1909.0
## - rad 1 1850.44 22250 1940.5
##
## Step: AIC=1896.56
## crim ~ zn + indus + chas + nox + rm + dis + rad + tax + ptratio +
## black + lstat + medv
##
## Df Sum of Sq RSS AIC
## - chas 1 16.54 20417 1895.0
## - rm 1 22.14 20422 1895.1
## - tax 1 22.16 20422 1895.1
## - indus 1 24.30 20424 1895.2
## <none> 20400 1896.6
## - ptratio 1 87.41 20488 1896.7
## - lstat 1 131.43 20532 1897.8
## - nox 1 166.37 20567 1898.7
## - black 1 174.40 20575 1898.9
## - zn 1 239.21 20639 1900.5
## - medv 1 447.81 20848 1905.5
## - dis 1 559.06 20959 1908.2
## - rad 1 1857.98 22258 1938.7
##
## Step: AIC=1894.97
## crim ~ zn + indus + nox + rm + dis + rad + tax + ptratio + black +
## lstat + medv
##
## Df Sum of Sq RSS AIC
## - tax 1 18.81 20436 1893.4
## - rm 1 22.76 20440 1893.5
## - indus 1 28.82 20446 1893.7
## <none> 20417 1895.0
## - ptratio 1 84.57 20501 1895.1
## - lstat 1 129.63 20546 1896.2
## - nox 1 175.96 20593 1897.3
## - black 1 178.37 20595 1897.4
## - zn 1 241.26 20658 1898.9
## - medv 1 483.38 20900 1904.8
## - dis 1 563.37 20980 1906.8
## - rad 1 1842.82 22260 1936.7
##
## Step: AIC=1893.44
## crim ~ zn + indus + nox + rm + dis + rad + ptratio + black +
## lstat + medv
##
## Df Sum of Sq RSS AIC
## - rm 1 23.0 20459 1892.0
## - indus 1 64.4 20500 1893.0
## <none> 20436 1893.4
## - ptratio 1 87.4 20523 1893.6
## - lstat 1 137.9 20574 1894.8
## - black 1 178.1 20614 1895.8
## - nox 1 181.9 20617 1895.9
## - zn 1 222.9 20658 1896.9
## - medv 1 465.3 20901 1902.8
## - dis 1 556.9 20992 1905.0
## - rad 1 4693.4 25129 1996.0
##
## Step: AIC=1892.01
## crim ~ zn + indus + nox + dis + rad + ptratio + black + lstat +
## medv
##
## Df Sum of Sq RSS AIC
## - indus 1 74.0 20533 1891.8
## <none> 20459 1892.0
## - ptratio 1 88.2 20547 1892.2
## - lstat 1 118.9 20577 1892.9
## - nox 1 176.9 20636 1894.4
## - black 1 202.4 20661 1895.0
## - zn 1 233.9 20692 1895.8
## - medv 1 458.7 20917 1901.2
## - dis 1 572.2 21031 1904.0
## - rad 1 4811.3 25270 1996.9
##
## Step: AIC=1891.83
## crim ~ zn + nox + dis + rad + ptratio + black + lstat + medv
##
## Df Sum of Sq RSS AIC
## <none> 20533 1891.8
## - lstat 1 104.7 20637 1892.4
## - ptratio 1 119.0 20652 1892.8
## - black 1 198.4 20731 1894.7
## - zn 1 239.6 20772 1895.7
## - nox 1 296.6 20829 1897.1
## - medv 1 430.2 20963 1900.3
## - dis 1 507.8 21040 1902.2
## - rad 1 4739.5 25272 1994.9
summary(step.model)
##
## Call:
## lm(formula = crim ~ zn + nox + dis + rad + ptratio + black +
## lstat + medv, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.860 -2.102 -0.363 0.895 75.702
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.683128 6.086010 3.234 0.001301 **
## zn 0.043293 0.017977 2.408 0.016394 *
## nox -12.753708 4.760157 -2.679 0.007623 **
## dis -0.918318 0.261932 -3.506 0.000496 ***
## rad 0.532617 0.049727 10.711 < 2e-16 ***
## ptratio -0.310541 0.182941 -1.697 0.090229 .
## black -0.007922 0.003615 -2.191 0.028897 *
## lstat 0.110173 0.069219 1.592 0.112097
## medv -0.174207 0.053988 -3.227 0.001334 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.428 on 497 degrees of freedom
## Multiple R-squared: 0.4505, Adjusted R-squared: 0.4416
## F-statistic: 50.92 on 8 and 497 DF, p-value: < 2.2e-16
broom::glance(bcm1)%>% dplyr::select(AIC,BIC)
## # A tibble: 1 x 2
## AIC BIC
## <dbl> <dbl>
## 1 -1568. -1547.
test_data <- sample_n(mydata,5)
test_data
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.16760 0 7.38 0 0.493 6.426 52.3 4.5404 5 287 19.6 396.90 7.20
## 2 0.88125 0 21.89 0 0.624 5.637 94.7 1.9799 4 437 21.2 396.90 18.34
## 3 0.13158 0 10.01 0 0.547 6.176 72.5 2.7301 6 432 17.8 393.30 12.04
## 4 1.35472 0 8.14 0 0.538 6.072 100.0 4.1750 4 307 21.0 376.73 13.04
## 5 0.06211 40 1.25 0 0.429 6.490 44.4 8.7921 1 335 19.7 396.90 5.98
## medv y
## 1 23.8 -15.60260
## 2 14.3 -15.50763
## 3 21.2 -15.61566
## 4 14.5 -15.48143
## 5 22.9 -15.65500
distPred <- predict(bcm1,test_data,interval = "confidence",level=0.9)
distPred <- (b1*distPred+1)**(1/b1)
actuals_preds <- data.frame(cbind(actuals=test_data$crim, predicteds=distPred))
actuals_preds
## actuals fit lwr upr
## 1 0.16760 1.452904e-21 1.343790e-21 1.570299e-21
## 2 0.88125 3.927925e-21 3.421960e-21 4.503550e-21
## 3 0.13158 3.484378e-21 3.184992e-21 3.810051e-21
## 4 1.35472 1.866524e-21 1.664043e-21 2.091981e-21
## 5 0.06211 1.405020e-22 1.139081e-22 1.728488e-22